Welcome!

This document is a template workflow to show the machine learning development process. Included are the following tabs:

  • Data and features: A description of data, cleaning methodology and features

  • Tidymodels setup: A setup of model engines and feature engineering “recipes”

  • Tuning and Metrics: Execution summaries of model performance

  • Explain: Metrics to “explain” aspects of black box models

  • Map: A map to investigate whether there are any geographic patterns in the errors

These are the parameters used in this specific run:

# Display the parameters used in this run as an interactive table.
# Coerce the knit `params` object to a plain list so list subsetting works
# (avoid naming it `t`, which shadows base::t()).
param_list <- params
class(param_list) <- "list"

# Drop empty parameters, collapse multi-valued ones into one string,
# then render as a one-column reactable.
non_empty_params <- param_list[!sapply(param_list, purrr::is_empty)]
sapply(non_empty_params, paste, collapse = ', ') %>%
  as.data.frame() %>%
  rename("value(s)" = 1) %>%
  reactable()

Data and features

Load data

# Call file to load data
# Sources the run-specific load_data.R (echoed so its code appears in the
# report) into the knit environment; in this run it defines `iris`.
source(file.path(here(),input_path,"load_data.R"),echo = TRUE, local = knitr::knit_global())
## 
## > # Load data
## > 
## > iris<-iris
## 
## > vis_miss(iris,warn_large_data = F)

Clean Data

See data cleaning code below.

# Call file to clean data
# Sources the run-specific clean_data.R into the knit environment; in this run
# it defines `df` as the filtered dataset. max.deparse.length keeps the echoed
# code from being truncated too aggressively.
source(file.path(here(),input_path,"clean_data.R"),echo = TRUE, local = knitr::knit_global(), max.deparse.length=1e3)
## 
## > df<-
## +   iris %>% filter(Sepal.Length <7.1)
vis_miss(df,warn_large_data = F)

Below is the entire dataset (with all original columns), which you can feel free to filter and explore.

# Show top 50 rows
# head(df) alone shows only 6 rows; pass n = 50 to match the stated intent.
FormatReactable(head(df, 50))

Features

See https://custom-scripts.sentinel-hub.com/custom-scripts/sentinel/sentinel-2/ for definition of some of the different remote sensing indices.

Our Outcome variable is Sepal.Length .

Our potential predictor variables are Sepal.Width, Petal.Length, Petal.Width, Species .

EDA

Below is some exploratory data analysis showing i) a histogram of all numeric columns, and ii) the frequencies of all categorical columns in the data.

# Histograms of every numeric predictor/outcome column
PlotNumericColumns(df%>%dplyr::select(predict_vars, outcome_var))
# Frequency bars for every categorical predictor column
PlotCategoricalColumns(df%>%dplyr::select(predict_vars))


# Boxplot (x = year, y = total emissions) Facet by crop

# cor_df<-cor(df%>%dplyr::select(where(is.numeric))%>%as_tibble())
# corrplot(cor_df)

Below is a correlation plot of all numeric variables

# Correlation matrix of the numeric modelling columns, displayed as a
# colour-coded corrplot with alphabetical ordering and small text labels.
numeric_model_cols <- df %>%
  dplyr::select(predict_vars, outcome_var) %>%
  dplyr::select(where(is.numeric)) %>%
  as_tibble()
cor_df <- cor(numeric_model_cols)
corrplot(cor_df, method = 'color', order = 'alphabet', tl.cex = 0.5)

Below is the relationship between each variable and the outcome variable

# Scatter of each numeric predictor against the outcome
PlotNumericColumnsVOutcome(df%>%dplyr::select(predict_vars,outcome_var), outcome_var)
# Distribution of the outcome within each level of each categorical predictor
PlotCategoricalColumnsVOutcome(df%>%dplyr::select(predict_vars,outcome_var), outcome_var)

Below is all missing data.

# Missing data
# Missingness map restricted to the modelling columns.
# Use FALSE rather than F, which is a reassignable binding.
vis_miss(df%>%dplyr::select(predict_vars, outcome_var), warn_large_data = FALSE)

Tidymodels setup

To run these models, we will use the tidymodels framework. Each tab below represents a key step in developing a model. These steps will all come together to become a Workflowset that we are able to tune

To read more, see tidymodels.org.

Split data

Below is the code for how we split data into training and test. Note that we are also setting up 5-fold cross validation.

# browser()
# Split data into training and test set --------------------
# Defines df_split, df_train and df_test in the knit environment; split
# proportion and optional group/strata variables come from `params`.
source("scripts/training_test.R", echo = TRUE, local = knitr::knit_global())
## 
## > # Split data into training and test set --------------------
## > set.seed(123)
## 
## > df_split <-
## +   if(is.null(params$group_split_var)){
## +     print("Non-Group Split")
## +     initial_split(df,prop = params$split_pct, strata = params$ .... [TRUNCATED] 
## [1] "Non-Group Split"
## 
## > df_train <- training(df_split)
## 
## > df_test <- testing(df_split)
# Split training set into k-fold cross validation for evaluation later
# Defines df_folds (grouped or ungrouped CV depending on params$group_split_var).
source("scripts/folds.R",echo = TRUE, local = knitr::knit_global())
## 
## > # Split training set into k-fold cross validation for evaluation later
## > df_folds<- 
## +   if(is.null(params$group_split_var)){
## +     print("Non-Group ..." ... [TRUNCATED] 
## [1] "Non-Group CV"

Below are some visualizations that show how the test and training data differ

#Visualize training and test data
# Stack train and test rows with a `dataset` label, then overlay their
# numeric distributions and categorical frequencies to check the split
# is representative.
PlotNumericColumns(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
                     dplyr::select(predict_vars, outcome_var, dataset),
                   fill_column = "dataset")
PlotCategoricalColumns(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
                   dplyr::select(predict_vars, dataset),
                   fill_column = "dataset")


# df %>% ggplot(aes(x = area_ac, y = total_co2eq_kg_ac, colour = region)) +geom_point()

# Same train-vs-test comparison, but plotted against the outcome variable
# to spot any systematic difference in the predictor/outcome relationship.
PlotNumericColumnsVOutcome(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
                     dplyr::select(predict_vars, outcome_var, dataset),
                     outcome = outcome_var,
                   colour_column = "dataset")
PlotCategoricalColumnsVOutcome(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
                   dplyr::select(predict_vars, dataset, outcome_var),
                   outcome = outcome_var,
                   fill_column = "dataset")

# Insert some custom EDA here.

# By fips code
# PlotCategoricalColumns(df_train%>%mutate(dataset = "train")%>%bind_rows(df_test%>%mutate(dataset = "test"))%>%
#                      dplyr::select(loc_fips, dataset),
#                    fill_column = "dataset")+
#   theme(axis.text=element_text(size=5))

Specs

Below are the specs (models) we are considering in our workflow.

source(file.path(here(),input_path,"specs.R"),echo = TRUE,local = knitr::knit_global())
## 
## > rf_spec<-
## +   rand_forest(trees = 1000,
## +               #The best mtry from tuning appeared to be 15
## +               mtry = 20,
## +               min_ .... [TRUNCATED] 
## 
## > rf_spec
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 20
##   trees = 1000
##   min_n = 2
## 
## Engine-Specific Arguments:
##   keep.inbag = F
##   importance = impurity
##   seed = 700
##   replace = T
## 
## Computational engine: ranger 
## 
## 
## > rf_spec_randomForest<-
## +   rand_forest(trees = 1000,
## +               #The best mtry from tuning appeared to be 15
## +               mtry = 10,
## +       .... [TRUNCATED] 
## 
## > rf_spec_randomForest
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 10
##   trees = 1000
## 
## Computational engine: randomForest 
## 
## 
## > xgb_spec <-
## +   boost_tree( tree_depth = 5,
## +               trees = 400,
## +               mode = "regression"
## +   ) %>%
## +   set_engine("xgboost")
## 
## > xgb_spec
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   trees = 400
##   tree_depth = 5
## 
## Computational engine: xgboost 
## 
## 
## > lm_spec <-
## +   linear_reg()
## 
## > lm_spec
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm 
## 
## 
## > knn_spec <-
## +   nearest_neighbor(neighbors = 8, weight_func = "triangular") %>%
## +   set_engine("kknn") %>%
## +   set_mode("regression")
## 
## > knn_spec
## K-Nearest Neighbor Model Specification (regression)
## 
## Main Arguments:
##   neighbors = 8
##   weight_func = triangular
## 
## Computational engine: kknn 
## 
## 
## > nnet_spec<-
## +   mlp()%>%
## +   set_mode("regression")%>%
## +   set_engine("nnet")
## 
## > nnet_spec
## Single Layer Neural Network Model Specification (regression)
## 
## Computational engine: nnet 
## 
## 
## > cubist_spec<-
## +   cubist_rules(
## +     mode = "regression",
## +     committees = NULL,
## +     neighbors = NULL,
## +     max_rules = NULL,
## +     engine = " ..." ... [TRUNCATED] 
## 
## > bart_spec<-
## +   parsnip::bart(trees = 236,
## +                 prior_terminal_node_coef =  0.953,
## +                 prior_terminal_node_expo = 0.788
## + .... [TRUNCATED] 
## 
## > bart_spec
## 
## Call:
## NULL
## 
## 
## > lasso_spec <- linear_reg(penalty = 0.1, mixture = 1)%>%
## +   set_engine("glmnet")
## 
## > lasso_spec
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.1
##   mixture = 1
## 
## Computational engine: glmnet 
## 
## 
## > stan_spec <-
## +   linear_reg() %>%
## +   set_engine("stan")
## 
## > stan_spec
## Linear Regression Model Specification (regression)
## 
## Computational engine: stan 
## 
## 
## > # Combine all specs into a list
## > model_list <-list("xgb" = xgb_spec,
## +                   "rf"= rf_spec,
## +                   "rfrandomForest" = rf_s .... [TRUNCATED]

Recipes

We may be considering different sets of predictors and/or different feature engineering steps. We define those below.

source(file.path(here(),input_path,"recipes.R"),echo = TRUE,local = knitr::knit_global())
## 
## > FeatureEngineering<- function(rec){
## +   rec%>%
## +     step_mutate(across(where(is.logical), as.numeric)) %>%
## +     # Remove categorical variables tha .... [TRUNCATED] 
## 
## > df_rec <-recipe(
## +   Formulate(y = outcome_var, x= predict_vars),
## +   data = df_train
## + )%>%
## +   FeatureEngineering()
## 
## > df_rec_width <-recipe(
## +   Formulate(y = outcome_var, x= c("Sepal.Width","Petal.Width")),
## +   data = df_train
## + )%>%
## +   FeatureEngineering()
## 
## > preproc_list<-list("main" = df_rec,
## +                    "mainwidth" = df_rec_width)

Workflowset

Recipes and Specs combine to create a workflowset, which is just an object that contains each model we will be training.

# Create workflowset
# Crosses preproc_list with model_list (filtered to params$recipes_to_run /
# params$models_to_run) into `df_wf_set`.
source("scripts/create_workflowset.R",echo = TRUE, local = knitr::knit_global())
## 
## > #Remove models we're not interested in running
## > model_list<-model_list[names(model_list) %in% params$models_to_run]
## 
## > #Remove recipes were not interested in
## > preproc_list<-preproc_list[names(preproc_list) %in% params$recipes_to_run]
## 
## > # This is where we define what's a part of the workflow set
## > df_wf_set <-
## +   workflow_set( # NOTE - ensure that preproc does not contain underscor .... [TRUNCATED]

In total there are 4 models, which we summarize below:

# Summarise every workflow in the set — outcome, predictors, recipe steps and
# the printed workflow object — rendered as an HTML reactable.
df_wf_set%>%
  rowwise()%>%
  mutate(
        # Variable roles are read from each workflow's embedded recipe
        predictors = paste(info$workflow[[1]]$pre$actions$recipe$recipe$var_info%>%filter(role == "predictor")%>%pull(variable), collapse = "<br>"),
         outcome =  paste(info$workflow[[1]]$pre$actions$recipe$recipe$var_info%>%filter(role == "outcome")%>%pull(variable),   collapse = ","),
        outcome_predictors = paste("<b>Outcome</b>",
                           outcome,
                           "<br><b>Predictors</b>",
                           predictors,
                           sep = "<br>"),
        # Drop the recipe print header ([-(1:10)]) and keep only the step list
        operations = paste(capture.output(info$workflow[[1]]$pre$actions$recipe$recipe)[-(1:10)], collapse = "<br><br>"),
         workflow_output = paste(capture.output(info$workflow, type = "output"), collapse = "<br>"))%>%
  dplyr::select(wflow_id, outcome_predictors:workflow_output)%>%
    reactable(
            columns = list(workflow_output = colDef(html = TRUE, width = 1000),
                           operations = colDef(html = TRUE),
                           outcome_predictors = colDef(html = TRUE, width = 250)

                           ),
            outlined = TRUE,
              highlight = TRUE,
              striped = TRUE,
              showSortable = TRUE,
              compact = FALSE,
              bordered = TRUE,
              # FALSE rather than F, which is a reassignable binding
              defaultExpanded = FALSE,
              filterable = TRUE,
              showPageSizeOptions = TRUE)

Tuning and Metrics

Workflowsets are first tuned in the code snippet below:

# browser()
#Run workflow set through cross validation ----
# Defines `tuned_workflows`: the workflow set evaluated over df_folds in
# parallel, with predictions saved for later inspection.
source("scripts/tune_cv.R",echo = TRUE, local = knitr::knit_global())
## 
## > registerDoParallel(cores = parallel::detectCores(logical = FALSE))
## 
## > grid_ctrl <-
## +   control_grid(
## +     save_pred = TRUE,
## +     parallel_over = "everything",
## +     save_workflow = FALSE
## +   )
## 
## > #Run workflow set through cross validation -----------------------
## > tuned_workflows<-workflow_map(
## +   df_wf_set%>%
## +     option_add(control = cont .... [TRUNCATED]
## Warning: There are existing options that are being modified
##  main_xgb: 'control'
##  main_rf: 'control'
##  mainwidth_xgb: 'control'
##  mainwidth_rf: 'control'
#t<-extract_workflow_set_result(tuned_workflows, "mainall_rf")

# autoplot(tuned_workflows)
# tuned_workflows[6,][["result"]][[1]][[".notes"]]

# Save tuned workflow to S3
# if (params$save_results){
#   WriteToS3(df = tuned_workflows,
#             s3name = paste0(params$run_name, "_tuned"),
#             type = "RDS",
#             obj = data_directory)
# }
# Combine models with stacks
# https://stacks.tidymodels.org/articles/basics.html
#https://stacks.tidymodels.org/articles/workflowsets.html
# https://www.youtube.com/watch?v=HZXZxvdpDOg&t=2160s
df_wf_set_stack<-
  # Initialize the stack
  stacks()%>%
  # Add each tuned workflow's candidate members
  add_candidates(tuned_workflows)%>%
  # determine how to combine their predictions
  # Look deeper into setting up a meta model
  # include fewer models by proposing higher penalties
  blend_predictions(penalty = 10^seq(-2, -0.5, length = 20))%>%
  # fit the candidates with nonzero stacking coefficients
  fit_members()

# Relevant when tuning:
# collect_parameters(df_wf_set_stack, "rf")

# TODO: Save stacked model to S3

Tuned models are saved [HERE] on S3.

Tuned Models

# Rank every tuned workflow by its cross-validated metrics, then split the
# workflow id ("recipe_model") into separate recipe and model columns.
ranked_results <-
  tuned_workflows %>%
  rank_results() %>%
  tidyr::extract(wflow_id, c("recipe", "model"), "([^_]+)_(.*)")


# Plot each workflow's mean CV metric (with +/- one standard error bars)
# against its rank, faceted per metric.
ranked_results%>%
  ggplot(aes(x = rank, y = mean, colour = model, shape = recipe))+
  geom_point()+
  geom_errorbar(aes(ymin = mean - std_err,
                    ymax = mean + std_err))+
  scale_colour_iwanthue() +
  scale_x_continuous(breaks= pretty_breaks())+
  labs(y="")+
  facet_wrap(~.metric, scales = "free")

# Plot of predictions from each fold
# Predicted vs actual per CV fold (coloured by fold id), one facet per
# workflow. FALSE rather than F, which is a reassignable binding.
collect_predictions(tuned_workflows, summarize = FALSE)%>%
  ggplot(aes(y = .pred, x = !!sym(outcome_var), colour = id))+
  geom_point(alpha = 0.1)+
  facet_wrap(~wflow_id)+
  scale_colour_iwanthue()+
  guides(colour = guide_legend(override.aes = list(alpha=1)))+
  labs(colour = "")

# V2
# collect_predictions(tuned_workflows, summarize = F)%>%
#   ggplot(aes(y = .pred, x = !!sym(outcome_var), colour = id))+
#   geom_point(alpha = 0.1)+
#   facet_grid(wflow_id~id)+
#   scale_colour_iwanthue()+
#   guides(colour = guide_legend(override.aes = list(alpha=1)))



# Tabular version of the ranked CV results
FormatReactable(rank_results(tuned_workflows))
# CV predictions, reused by later sections
ensemble_predictions<- collect_predictions(tuned_workflows)

Stacking

After each model is trained, we can generate an ensemble model via stacking:

# Print the fitted stack: which members received nonzero stacking weights
print(df_wf_set_stack)


# Penalty-path diagnostics for the blending step
autoplot(df_wf_set_stack)
# autoplot(df_wf_set_stack, type = "weights")

# Ensemble predictions and errors on the test set
ensemble_preds<-
  df_test%>%
  mutate(predict(df_wf_set_stack, .))%>%
  mutate(ensemble_error = !!sym(outcome_var) - .pred)

member_preds_test <-
  df_test %>%
  dplyr::select(!!sym(outcome_var))%>%
  # Include every stack member's prediction as an additional column
  bind_cols(predict(df_wf_set_stack, df_test, members = TRUE))

# Member predictions compared to ensemble
# NOTE(review): the loop runs over ALL columns, so the outcome column is also
# scored against itself (RMSE 0); confirm downstream code filters it out.
member_stats_test<-
  map_dfr(colnames(member_preds_test), rmse, truth = !!sym(outcome_var), data = member_preds_test) %>%
  mutate(member = colnames(member_preds_test))


# On training data
# Same member-level predictions and RMSEs, computed on the training set
member_preds_train<-
  df_train %>%
  dplyr::select(!!sym(outcome_var))%>%
  # Include every stack member's prediction as an additional column
  bind_cols(predict(df_wf_set_stack, df_train, members = TRUE))

member_stats_train<-
  map_dfr(colnames(member_preds_train), rmse, truth = !!sym(outcome_var), data = member_preds_train) %>%
  mutate(member = colnames(member_preds_train))


# Ensemble actual-vs-predicted scatter; subtitle shows its test RMSE
# (the ".pred" row of member_stats_test is the ensemble itself).
ensemble_preds%>%
  ggplot(aes(x = !!sym(outcome_var), y = .pred))+
  geom_point()+
  geom_abline(linetype = "dashed",
              color = "gray")+
  labs(x = "actual", y = "predicted", title = "Stacking Test Actual vs Predicted", subtitle= paste0("(RMSE: ", round(member_stats_test%>%filter(member == ".pred")%>%pull(.estimate), 2), ")"))+
  theme_bw()


# Per-member test RMSE table
FormatReactable(member_stats_test)
# Tune individual workflows
# Defines `fits`: per-workflow list of fitted model, plots and metrics
# (see positional indices used throughout the rest of the report).
source("scripts/tune_workflows.R",echo = TRUE, local = knitr::knit_global())
## 
## > fits<-  lapply(df_wf_set$wflow_id,
## +                TuneExtractModel,
## +                wf =if(params$n_folds > 1){tuned_workflows}else{df_wf_set},
## + .... [TRUNCATED] 
## [1] "main_xgb"
## [1] "main_rf"
## → A | warning: 20 columns were requested but there were 5 predictors in the data. 5 will be used.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1
## [1] "mainwidth_xgb"
## [1] "mainwidth_rf"
## → A | warning: 20 columns were requested but there were 2 predictors in the data. 2 will be used.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1
## 
## > names(fits) <- df_wf_set$wflow_id
#example use:
#fits[['main_rf']][[2]]
# browser()

# Optionally persist the selected fitted workflow to disk
# (fits[[id]][[1]] is the fitted workflow object).
if (params$save_results){
  saveRDS(fits[[params$model_to_save]][[1]], paste0("~/downloads/", params$run_name,"_", params$model_to_save,"_fit",".RDS"))

  # WriteToS3(df = fits[[params$model_to_save]][[1]]%>%extract_fit_parsnip(),
  #           s3name = paste0(params$run_name,"_", params$model_to_save,"_fit"),
  #           type = "RDS",
  #           obj = data_directory)
}

RMSE Summary

Below is a summary of RMSEs across the training data, cross-validated model set, and test set for all 4 models plus the ensemble.

# Collect test/train RMSEs for each fitted workflow into a single tibble.
# fits[[id]] positional contract: [[3]] = test RMSE, [[4]] = train RMSE.
rmses <-
  df_wf_set$wflow_id %>%
  lapply(function(wf_id) {
    tibble(
      m = wf_id,
      test_rmse = fits[[wf_id]][[3]],
      train_rmse = fits[[wf_id]][[4]]
    )
  }) %>%
  bind_rows()

# When cross validation was run, join in the mean CV RMSE per workflow
# (ranked_results ids are recipe/model pairs, re-pasted to match `m`).
if(params$n_folds > 1){
  rmses<-
    rmses%>%
    left_join(
      ranked_results%>%
        mutate(m = paste0(recipe,"_", model))%>%
        filter(.metric == "rmse")%>%
        dplyr::select(m, "cv_rmse" = mean),
      by = "m"
    )
}

# When stacking was run, append the ensemble's test/train RMSE
# (the ".pred" member is the ensemble itself) and rank by test RMSE.
if(params$run_stack){
  rmses<-
    rmses %>%
  bind_rows(member_stats_test%>%
              filter(member == ".pred")%>%
              transmute(m = "ensemble", test_rmse = .estimate)%>%
              left_join(member_stats_train%>%
                          filter(member == ".pred")%>%
                          transmute(m = "ensemble",
                                    train_rmse = .estimate), by = "m")
            )%>%
  arrange(test_rmse)%>%
  mutate(rank = 1:n())
}

FormatReactable(rmses)

Below is a grid of how the errors across models are correlated

# Long table of per-observation test errors for every workflow
# (fits[[id]][[8]] holds the per-observation prediction/error frame).
error_crossmodel<-
  lapply(df_wf_set$wflow_id, function(e){
    fits[[e]][[8]]%>%
      dplyr::select(.pred, error, model, !!sym(outcome_var))%>%
      mutate(wflow_id = e,
             id = 1:n())
})%>%
  bind_rows()%>%
  arrange(- !!sym(outcome_var))%>%
  filter(model == "Test")

# Pairwise grid of error correlations across workflows: for each workflow e,
# align its per-observation error (`val`) against every other workflow's
# error and summarise agreement.
final<-
  lapply(df_wf_set$wflow_id, function(e){
  error_crossmodel%>%
    mutate(m = e)%>%
    group_by(id)%>%
    # Workflow e's error on this observation, broadcast to all rows of the id
    mutate(val = error[wflow_id == e])%>%
    group_by(m,wflow_id)%>%
    summarise(mae = Metrics::mae(error,val),
              rmse = Metrics::rmse(error,val),
              cor = cor(error,val))
  })%>%
  bind_rows()%>%
  dplyr::select(m, wflow_id, cor)%>%
  # pivot_wider() supersedes the retired spread()
  tidyr::pivot_wider(names_from = wflow_id, values_from = cor)%>%
  arrange(m)

FormatReactable(final)
# Errors by actual value: one point per workflow per observation, with a
# segment spanning the min-to-max error across workflows for that observation.
gg1<-error_crossmodel%>%
  group_by(id)%>%
  mutate(max_error = max(error),
         min_error = min(error))%>%
    ggplot()+
    geom_point(aes(x = !!sym(outcome_var), y = error, colour = wflow_id), 
               alpha = 0.5, size = 0.5)+
   geom_segment(aes(y = max_error, yend= min_error,x = !!sym(outcome_var),xend=!!sym(outcome_var), group = id),
                linewidth =0.01)+
    geom_abline(slope = 0, colour = "red")+
    facet_wrap(~model, nrow = 1, scales = "free_x")+
    scale_colour_iwanthue()+
    labs(x = "Actual Value")

# Version 2
# Same idea, but observations on the x axis ordered by descending error
gg2<-error_crossmodel%>%
  group_by(id)%>%
  mutate(max_error = max(error),
         min_error = min(error))%>%
  ungroup()%>%
  mutate(id = fct_reorder(as.character(id), -error))%>%
    ggplot()+
    geom_point(aes(x = id, y = error, colour = wflow_id), 
               alpha = 0.5, size = 0.5)+
   geom_segment(aes(y = max_error, yend= min_error,x = id,xend= id, group = id),
                linewidth =0.01)+
    geom_abline(slope = 0, colour = "red")+
    facet_wrap(~model, nrow = 1, scales = "free_x")+
    scale_colour_iwanthue()+
    theme(axis.text.x=element_blank()) +
    labs(x = "Observation")


# Show both error views side by side
grid.arrange(gg1, gg2, nrow = 1)

# Build the Rmd text for one "recipe x model" results tab.
# `nm` is the recipe name, `m` the model name; knitr::knit_expand()
# substitutes them into the {{nm}}/{{m}} placeholders below, producing a
# chunk that prints the stored plots/tables for fits[['nm_m']].
GenerateChunk <-
  function(nm, m) {
    chunk_lines <-
      c(
        "### {{nm}}\n",
        "```{r, echo = FALSE, fig.width=12, out.width='100%', dpi=300}\n",
        #"fits[['{{nm}}_{{m}}']][[1]]%>%extract_fit_engine()%>%print()\n",
        "fits[['{{nm}}_{{m}}']][[5]]\n",
        "fits[['{{nm}}_{{m}}']][[7]]\n",
        "fits[['{{nm}}_{{m}}']][[2]]\n",
        "fits[['{{nm}}_{{m}}']][[9]]\n",
        "fits[['{{nm}}_{{m}}']][[10]]\n",
        "```\n",
        "\n"
      )

    knitr::knit_expand(text = chunk_lines)
  }
# Assemble the Rmd text for every model tab: a "## model {.tabset}" header
# followed by one expanded chunk per recipe. Built functionally rather than
# by growing a vector inside a loop.
text <- unlist(lapply(params$models_to_run, function(model_name) {
  c(
    paste0("## ", model_name, "{.tabset}"),
    unlist(lapply(params$recipes_to_run, GenerateChunk, m = model_name))
  )
}))

xgb

main

## NULL
## # A tibble: 1 × 1
##   .config             
##   <chr>               
## 1 Preprocessor1_Model1

mainwidth

## NULL
## # A tibble: 1 × 1
##   .config             
##   <chr>               
## 1 Preprocessor1_Model1

rf

main

## NULL
## # A tibble: 1 × 1
##   .config             
##   <chr>               
## 1 Preprocessor1_Model1

mainwidth

## NULL
## # A tibble: 1 × 1
##   .config             
##   <chr>               
## 1 Preprocessor1_Model1

Other info

You can use this chunk to customize some results! If you don’t provide an “other_chunks/other_chunks.Rmd” in your directory, this tab won’t show up!

# This is how to run TuneExtractModel (more for testing)

# Demonstration of TuneExtractModel for a single workflow.
fit_xgb<-TuneExtractModel(tuned_wf =tuned_workflows,
                 string_model = "main_xgb",
                 training_data = df_train,
                 testing_data = df_test)

# [[1]] is the fitted workflow, [[2]] a diagnostic output
fit_xgb_best<- fit_xgb[[1]]
fit_xgb[[2]]

# Test-set predictions from the best xgb fit
xgb_preds <-
    df_test %>%
    bind_cols(predict(fit_xgb_best, df_test))


# Variable importance. pull_workflow_fit() is deprecated in {workflows};
# extract_fit_parsnip() is its replacement (already used elsewhere in this
# report, see the commented WriteToS3 call in the save block).
grid.arrange(
  fit_xgb_best%>%
    extract_fit_parsnip()%>%
    vip(geom = "col")
)
#Compare Models

# TODO: Turn into mini widget where you can compare the predictions of two models

# Compare xgb and rf predictions observation by observation.
# NOTE(review): `rf_preds` and the `id_hash` key are not defined anywhere in
# this document — presumably created in an "other_chunks" script; verify.
xgb_rf<-
xgb_preds%>%
  rename("xgb_pred" = .pred)%>%
  mutate(xgb_error = !!sym(outcome_var) - xgb_pred )%>%
  left_join(rf_preds%>%dplyr::select(id_hash, "rf_pred" = .pred), by = "id_hash")

# xgb vs rf predictions, coloured by xgb error
xgb_rf%>%
    ggplot(aes(x = xgb_pred, y = rf_pred, colour = xgb_error))+
    geom_point()+
    geom_abline(linetype = "dashed",
                color = "gray")+
    scale_colour_distiller(type = "div")

# NOTE(review): `time_harvest_year` is not a column of this run's data (iris);
# looks like a leftover from another dataset — confirm before relying on it.
xgb_rf%>%
    ggplot(aes(x = time_harvest_year, y = !!sym(outcome_var), colour = xgb_error))+
    geom_point()+
    geom_abline(linetype = "dashed",
                color = "gray")+
    scale_colour_distiller(type = "div")


#### Test Errors ---------------------
#Difference in test error

  # What do all the ensemble observations look like compared to individual models?
  # Long table of per-observation errors (rf/xgb/ensemble), one row per
  # model-error type via gather().
  error_df<-
    xgb_rf%>%
    mutate(rf_error = !!sym(outcome_var) - rf_pred)%>%
  left_join(ensemble_preds%>%dplyr::select(id_hash, "ensemble_pred" = .pred, ensemble_error), by = "id_hash")%>%
  arrange(-ensemble_error)%>%
  # Set plot order to descending ensemble error
  mutate(id_hash = factor(id_hash, levels = id_hash))%>%
  gather(var, val, contains("_error"))


# Error widget
# NOTE(review): "total_emissions"/"ipcc_climate" are not columns of this run's
# data (iris) — these calls look dataset-specific; confirm.

    GenerateErrorBars(error_df, "total_emissions")
    GenerateErrorBars(error_df, "ipcc_climate")
    # GenerateErrorBars(error_df, "crop")
    # GenerateErrorBars(error_df, "rs_yield")

    

# Small embedded Shiny widget: pick a fill variable from the dataset's
# columns and redraw the error-bar plot interactively.
WidgetErrorBar <- function(dataset) {

  library(shiny)
  vars <- colnames(dataset)

  shinyApp(
    ui = fluidPage(
      fluidRow(style = "padding-bottom: 20px;",
        column(12, selectInput('fill_var', 'Fill Variable', vars, selected = "total_emissions"))
      ),
      fluidRow(
        plotlyOutput('error_bar_plot', height = "400px")
      )
    ),

    server = function(input, output, session) {

      output$error_bar_plot <- renderPlotly({
        # Bug fix: use the `dataset` argument. The original referenced the
        # global `error_df`, silently ignoring whatever was passed in.
        ggplotly(GenerateErrorBars(dataset, input$fill_var))
      })
    },

    options = list(height = 500)
  )
}

# WidgetErrorBar(error_df)

Explain

Below is a set of diagnostics aimed to make our models more explainable. Note here that these are developed using the DALEX package.

For more information on Explainable ML, see the following resources:

Global

The following diagnostics are at the dataset (global) level, meaning they average behavior at the highest level of the machine learning algorithm. These methods are model-agnostic that describe average behavior of the model.

Note: All Global diagnostic plots from the DALEX package below are based on plugging in the training data (as opposed to the test data).

Model Diagnostics

# Model performance
# DALEX residual summaries for each explained model.
# NOTE(review): a plain list is piped into plot(); this relies on DALEX's
# plot methods accepting a list of explainer outputs — verify dispatch.
lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  model_performance(explainer)

})%>%
  plot(geom = "histogram")
  #plot(resids_rf, resids_xgb, geom = "boxplot")


# Residuals vs observed outcome for each explained model
lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  model_diagnostics(explainer)
})%>%
  plot(variable = "y", yvariable = "residuals")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Variable Importance

# Dalex variable importance
# Permutation importance (RMSE loss, single permutation B = 1) per model
v<-
  lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  variable_importance(explainer,  loss_function = loss_root_mean_square, B = 1)

})


v%>%plot(max_vars = 15)

# Collapse importances across models into one ordered vector of variable
# names, dropping the "_full_model_" baseline and "_baseline_" rows.
model_vars <- v%>%
  bind_rows()%>%
  group_by(label)%>%
  filter(dropout_loss != dropout_loss[variable == "_full_model_"])%>%
  filter(variable!="_baseline_")%>%
  arrange(-dropout_loss)%>%
  pull(variable)%>%
  unique()

# Used later to order break-down plots
top_three_most_important_vars<-model_vars[1:3]

# single_variable(explainer_rf, variable = "rs_yield")

Dependence Profiles

Below is a set of plots defined below:

  • Partial Dependence Profile: “Let me show you what the model predicts on average when each data instance has the value v for that feature. I ignore whether the value v makes sense for all data instances.”
  • Local Dependence Profile: “Let me show you what the model predicts on average for data instances that have values close to v for that feature. The effect could be due to that feature, but also due to correlated features.”
  • Accumulated Dependence Profile: “Let me show you how the model predictions change in a small “window” of the feature around v for data instances in that window.”

Source: https://christophm.github.io/interpretable-ml-book/ale.html#motivation-and-intuition

#PDP
# Partial dependence profiles (N = 50 sampled observations)
p1<-lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  model_profile(explainer, type = "partial",N = 50, variables = model_vars)

})%>%
  plot()

# Local Dependence Profiles
p2<-lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  model_profile(explainer, type = "conditional", N = 50, variables = model_vars)

})%>%
  plot()+
  ggtitle("Local Dependence profile")

# Accumulated Dependence Profiles
p3<-lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  model_profile(explainer, type = "accumulated", N = 50, variables = model_vars)

})%>%
  plot()

# patchwork layout: the three profile types side by side
(p1 | p2 | p3)

# All 3 together
# TODO: Use the profile objects computed above instead of recomputing.
lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]

  # Descriptive names; the original used `c`, which shadows base::c()
  ale<-model_profile(explainer, type = "accumulated", N = 50, variables = model_vars)
  ale$agr_profiles$`_label_` = "accumulated local"

  ldp<-model_profile(explainer, type = "conditional", N = 50, variables = model_vars)
  ldp$agr_profiles$`_label_` = "local dependence"

  pdp<-model_profile(explainer, type = "partial", N = 50, variables = model_vars)
  pdp$agr_profiles$`_label_` = "partial dependence"

  plot(ale,ldp,pdp)+
    ggtitle(paste0("Dependence Profiles (",e ,")"), subtitle = "")

})%>%
    grid.arrange(grobs = ., nrow = 1)

# plot(pdp_rf, variable_type = "categorical")

Local

The following diagnostics are at the instance (local) level, meaning they are at the prediction level of the machine learning algorithm. These methods are useful to break down and understand model predictions for a given data point.

In our diagnostics below, we will use the following data point(s):

# Choose an observation of interest
# First test-set row; all local diagnostics below explain this observation.
new_obs<-df_test%>%head(1)#%>%dplyr::select(predict_vars)

FormatReactable(new_obs)

Break Down Plot

Note that the ordering of breakdown plots matters.

# Break down plot
# Note that the ordering of breakdown matters and these are additive.

# lapply(params$models_to_explain, function(e){
# 
#   explainer<-fits[[e]][[6]]
#   predict_parts(explainer,
#                 new_observation = new_obs,
#                 type = "break_down")%>%
#     plot()
# })%>%
#   grid.arrange(grobs = .)

# Break down plot (with distributions)
# Ordered by the top-three global importance variables so plots are
# comparable across models. TRUE rather than T (reassignable binding).
lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  predict_parts(explainer,
                new_observation = new_obs,
                type = "break_down",
                keep_distributions = TRUE,
                order = top_three_most_important_vars)%>%
    plot(plot_distributions = TRUE)
})%>%
  grid.arrange(grobs = .)



# Break down interactions
# Ordering also really matters here.
# lapply(params$models_to_explain, function(e){
#
#   explainer<-fits[[e]][[6]]
#   predict_parts(explainer,
#                 new_observation = new_obs,
#                 type = "break_down_interactions",
#                 interaction_preference = 2)%>%
#     plot()
# })%>%
#   grid.arrange(grobs = .)



# Shapley
# shap_rf <- predict_parts(explainer_rf, new_observation = new_obs, type = "shap", B = 50)
# plot(shap_rf)

Stability

# Ceteris Paribus Profile
# lapply(params$models_to_explain, function(e){
#
#   explainer<-fits[[e]][[6]]
#   predict_profile(explainer,
#                   new_observation = new_obs)
# })%>%
# plot(variables = c("rs_yield"),
#      color = "_label_"
#      )+
#   ggtitle("Ceteris-paribus profile", "")


# plot(cp_rf, variables = c("hist_prevcrop"),
#      variable_type = "categorical", categorical_type = "bars")
#   ggtitle("Ceteris-paribus profile", "")


#Local stability
# Residual distribution among the 30 nearest neighbours of new_obs,
# one panel per explained model.
p1<-lapply(params$models_to_explain, function(e){

  n<- 30

  explainer<-fits[[e]][[6]]
  predict_diagnostics(explainer,
                         new_obs,
                         neighbors = n)%>%
    plot()+
    ggtitle(paste0("Distribution of Residuals: ", e))
})%>%
  grid.arrange(grobs = .)

# Variable specific stability
# Same neighbourhood diagnostic, restricted to the single most important
# variable.
p2<-lapply(params$models_to_explain, function(e){

  explainer<-fits[[e]][[6]]
  predict_diagnostics(explainer,
                         new_obs,
                         neighbors = 30,
                      variables = top_three_most_important_vars[1])%>%
    plot()
})%>%
  grid.arrange(grobs = .)

# grid.arrange(p1, p2, nrow = 1)

Summary

# Trying to put this on top of report:
# https://stackoverflow.com/questions/23570718/creating-summaries-at-the-top-of-a-knitr-report-that-use-variables-that-are-defi
# {r chunk-a, ref.label=c('tune_rf','param_table')}

  # rmses%>%
  # filter(rank == min(rank))%>%
  # rename("best_model" = m)%>%
  # FormatReactable(filt = F)